The goal of this project is to investigate what causes Serious and Fatal accidents, in the hope of preventing them and decreasing their number. The dataset consists of accident records from the UK over the course of 15+ years. I hope to show the causes of these accidents through visualizations and create an algorithm that can predict the severity of accidents.
The UK government collects and publishes (usually on an annual basis) detailed information about traffic accidents across the country. This information includes, but is not limited to, geographical locations, weather conditions, type of vehicles, number of casualties and vehicle manoeuvres, making this a very interesting and comprehensive dataset for analysis and research.
The data that I'm using is compiled and available through Kaggle and, in a less compiled form, here.
Genesis L. Taylor
Github | Linkedin | Tableau | genesisltaylor@gmail.com
Problem: Traffic Accidents
Solution Method: Use data to figure out how to lower the number of accidents and the severity of them.
UK Road Safety: Traffic Accidents and Vehicles Introduction, Data Cleaning, and Feature Manipulation
UK Road Safety: Traffic Accidents and Vehicles Introduction, Data Cleaning, and Feature Manipulation: Github Link
UK Road Safety: Traffic Accidents and Vehicles Visualizations and Solution
UK Road Safety: Traffic Accidents and Vehicles Visualizations and Solution: Github Link
UK Road Safety: Traffic Accidents and Vehicles Machine Learning
UK Road Safety: Traffic Accidents and Vehicles Machine Learning: Github Link
Traffic Analysis and Severity Prediction Powerpoint Presentation
Traffic Analysis and Severity Prediction Powerpoint Presentation: Github Link
#Import modules
import numpy as np
import holidays
import pandas as pd
import seaborn as sns
import pickle
import time
import timeit
import matplotlib.pyplot as plt
plt.style.use('dark_background')
%matplotlib inline
import datetime
import math
from collections import Counter
#scipy
import scipy.stats as stats
from scipy import stats
from scipy.stats import chi2_contingency
#sklearn
import sklearn
from sklearn import ensemble
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, log_loss, recall_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
#for clustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
#other learners
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from kmodes.kmodes import KModes
#imblearn
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
#webscraping
import requests
from bs4 import BeautifulSoup
import re
import urllib
from IPython.core.display import HTML
#time series
import statsmodels.api as sm
from pylab import rcParams
import itertools
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima_model import ARIMA
#warning ignorer
import warnings
warnings.filterwarnings("ignore")
# # #DATAFRAME PICKLE CREATED IN CELLS BELOW INSTEAD OF RUNNING THROUGH ENTIRE PROCESS AFTER RESTARTING
# # #import pickled file
# NOTE(review): df.pkl is written by a later cell (df.to_pickle), so it will not exist on a first run.
# Only unpickle files you created yourself; unpickling untrusted data can execute arbitrary code.
df = pd.read_pickle("df.pkl")
# df.to_csv('uktraffic_acc.csv')
#import files
# Both CSVs are opened as iterators of 30,000-row chunks; the loops below filter them chunk by chunk.
ac = pd.read_csv(r'Accident_Information.csv', low_memory=False, chunksize=30000)
vc = pd.read_csv(r'Vehicle_Information.csv', low_memory=False, chunksize=30000)
Previously, I did not remove "Data missing or out of range" values from the datasets; however, after cleaning and checking the value counts, I decided to remove them for sanity purposes only. In most columns these values made up only a small percentage of the rows anyway.
# Filter the accident records one chunk at a time, then stitch the kept rows together.
# A row is kept when its year is 2010-2017, its road type is known, none of the
# categorical columns below holds "Data missing or out of range", and both
# coordinates are present.
ac_missing_cols = ['Junction_Control', 'Carriageway_Hazards', 'Junction_Detail',
                   'Road_Surface_Conditions', 'Special_Conditions_at_Site',
                   'Weather_Conditions']
acchunk = []
for chunk in ac:
    accident_year = chunk['Year'].astype(int)
    keep = (
        accident_year.between(2010, 2017)
        & chunk['Road_Type'].ne("Unknown")
        & chunk[ac_missing_cols].ne("Data missing or out of range").all(axis=1)
        & chunk[['Latitude', 'Longitude']].notnull().all(axis=1)
    )
    acchunk_filter = chunk[keep]
    acchunk.append(acchunk_filter)
df1 = pd.concat(acchunk)
# Filter the vehicle records one chunk at a time, then stitch the kept rows together.
# A row is kept when its year is 2010-2017 and none of the categorical columns
# below holds "Data missing or out of range".
vc_missing_cols = ['Driver_Home_Area_Type', 'Journey_Purpose_of_Driver',
                   'Junction_Location', 'Was_Vehicle_Left_Hand_Drive',
                   'Hit_Object_in_Carriageway', 'Skidding_and_Overturning',
                   'Towing_and_Articulation', 'Vehicle_Leaving_Carriageway',
                   'Vehicle_Manoeuvre', 'Vehicle_Type', 'X1st_Point_of_Impact',
                   'Sex_of_Driver', 'Age_Band_of_Driver']
vcchunk = []
for chunk2 in vc:
    vehicle_year = chunk2['Year'].astype(int)
    keep2 = (
        vehicle_year.between(2010, 2017)
        & chunk2[vc_missing_cols].ne("Data missing or out of range").all(axis=1)
    )
    vcchunk_filter = chunk2[keep2]
    vcchunk.append(vcchunk_filter)
df2 = pd.concat(vcchunk)
# Report the columns and sizes of the two filtered frames before joining them.
print("Accident's Columns:\n", df1.columns, "\n")
print("Vehicle's Columns:\n", df2.columns)
for shape_label, frame in (('Accident Shape', df1), ('Vehicle Shape', df2)):
    print(shape_label, frame.shape)
# Join on the columns the two frames have in common (pandas' default merge keys).
df = df1.merge(df2)
print("Names of Combined Columns:\n", df.columns, "\n")
print("\nShape:\n", df.shape)
df.describe(include='all')
# Compare the OSGR grid references with longitude/latitude via their correlations.
for grid_col, geo_col in (('Location_Easting_OSGR', 'Longitude'),
                          ('Location_Northing_OSGR', 'Latitude')):
    print(df[grid_col].corr(df[geo_col]))
# Drop the grid references because they are similar to latitude and longitude.
df = df.drop(columns=['Location_Easting_OSGR', 'Location_Northing_OSGR'])
df.shape
# Standardize column names for ease of querying: lowercase, then strip '.', '(' and ')'.
# regex=False makes every replacement a literal one regardless of the pandas version;
# read as a regular expression, '.' matches any character and a lone '(' or ')' is
# not a valid pattern.
df.columns = df.columns.str.lower()
for unwanted_char in ('.', '(', ')'):
    df.columns = df.columns.str.replace(unwanted_char, '', regex=False)
#convert date/time to datetime datatype
df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")
#df.dtypes
#mistyped datatypes
# These columns are cast to object dtype. Each name is listed exactly once:
# a repeated label in a multi-column assignment is ambiguous.
object_columns = ['did_police_officer_attend_scene_of_accident',
                  'driver_imd_decile', 'vehicle_reference',
                  'vehicle_locationrestricted_lane', '1st_road_number',
                  '2nd_road_number',
                  'pedestrian_crossing-physical_facilities',
                  'pedestrian_crossing-human_control']
df[object_columns] = df[object_columns].astype('object')
# Column names grouped by their dtype.
df.columns.to_series().groupby(df.dtypes).groups
# Percentage of null values per column, highest first.
df.isnull().sum().sort_values(ascending=False)/df.shape[0]*100
# #2nd_road_class
# Each value's count as a percentage of all rows.
df['2nd_road_class'].value_counts()/df.shape[0]*100
With 40% of the non-null values being unclassified and 39% of the overall 2nd_road_class column being null, I have decided to drop it in its entirety.
# Remove the 2nd_road_class column altogether.
df = df.drop(columns=['2nd_road_class'])
#driver_imd_decile
# Each decile's count as a percentage of all rows.
df['driver_imd_decile'].value_counts() / df.shape[0] * 100
Since the distribution of categories for 'driver_imd_decile' seems very similar, I've decided not to use the mode but "method='ffill'" instead.
# Forward-fill missing deciles. The result is assigned back to the column instead of
# calling fillna(..., inplace=True) on the selected Series, which is not guaranteed to
# update df; Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
df['driver_imd_decile'] = df['driver_imd_decile'].ffill()
df['age_of_vehicle'].describe()
df['age_of_vehicle'].median()
Changing the nulls of "age of vehicle" to median, then creating it as a category
#fillna by 7
# Assign the filled Series back to the column; an in-place fillna on the selected
# column is not guaranteed to update df.
df['age_of_vehicle'] = df['age_of_vehicle'].fillna(7)
#group age_of_vehicle
def fixedvehicleage(age):
    """Return ``age`` when it lies in the closed range 0-120, otherwise NaN."""
    return age if 0 <= age <= 120 else np.nan
# Blank out ages outside 0-120, then bin the rest into seven ordered groups:
# '1': 0-2, '2': >2-5, '3': >5-8, '4': >8-11, '5': >11-14, '6': >14-17, '7': >17-120
df['age_of_vehicle'] = df['age_of_vehicle'].apply(fixedvehicleage)
# include_lowest=True keeps an age of exactly 0 in group '1'; without it the first
# bin is open on the left and those rows would become NaN.
df['age_of_vehicle'] = pd.cut(df['age_of_vehicle'],
                              [0, 2, 5, 8, 11, 14, 17, 120],
                              labels=['1', '2', '3', '4', '5', '6', '7'],
                              include_lowest=True)
#model
# Each model's count as a percentage of all rows, followed by summary statistics.
df['model'].value_counts()/df.shape[0]*100
df['model'].describe()
Knowing that there are 28824 unique models for the model column I have decided to use the ffill method on it as well.
# Forward-fill missing models, assigning the result back so df is reliably updated
# (Series.ffill() replaces the deprecated fillna(method='ffill')).
df['model'] = df['model'].ffill()
Note: A lot of the values of "model" are labeled as "missing". I do not want to change these because the model could actually have been missing from the car in the accident, or it may not have been recognizable at the time of the accident.
#engine_capacity_cc
# Summary statistics of engine capacity before any outlier or null handling.
df['engine_capacity_cc'].describe()
I am going to handle both outliers and the null values of engine_capacity_cc using the ideals of quantiles and the interquartile range (IQR).
# Outlier fences for engine_capacity_cc: 1.5 * IQR below the first quartile and
# above the third quartile, computed on the non-null values only.
engine_cc_known = df['engine_capacity_cc'].dropna()
q75, q25 = np.percentile(engine_cc_known, [75, 25])
iqr = q75 - q25
whisker = iqr*1.5
ecmin = q25 - whisker
ecmax = q75 + whisker
print(ecmax)
print(ecmin)
To explain: I am going to use ecmax as the maximum allowed engine_capacity_cc and ecmin as the minimum allowed engine_capacity_cc. Then I'm going to take the mean of the remaining values and use it as my fillna value.
# Keep only the rows whose engine capacity lies inside [ecmin, ecmax] (both inclusive).
# NOTE(review): a null engine_capacity_cc fails the comparison, so rows with a null
# value are removed here as well -- confirm that this is intended.
within_fences = df['engine_capacity_cc'].between(ecmin, ecmax)
df = df[within_fences]
df['engine_capacity_cc'].hist(bins=20)
plt.style.use('dark_background')
I can accept this distribution and will now check and handle their nulls
#check values of 'engine_capacity_cc'
# Summary statistics and the mean of the filtered column.
df['engine_capacity_cc'].describe()
df['engine_capacity_cc'].mean()
Going to round this mean value
# Fill any remaining nulls with 1652, assigning the result back so df is reliably
# updated (an in-place fillna on the selected column is not guaranteed to do so).
df['engine_capacity_cc'] = df['engine_capacity_cc'].fillna(1652)
Note: After doing the above null fixes, propulsion_code dropped from having 10% null values to 0 (see below). I will continue on and fix lsoa_of_accident_location, then drop the rest of the null values, which are all <5%.
# Percentage of null values per column, highest first.
df.isnull().sum().sort_values(ascending=False)/df.shape[0]*100
# #lsoa_of_accident_location
# Frequency of each LSOA code, followed by summary statistics for the column.
df['lsoa_of_accident_location'].value_counts()
df['lsoa_of_accident_location'].describe()
With 35061 unique values and high counts among the top values, I am deciding to use ffill again.
# Forward-fill missing LSOA codes, assigning the result back so df is reliably updated
# (Series.ffill() replaces the deprecated fillna(method='ffill')).
df['lsoa_of_accident_location'] = df['lsoa_of_accident_location'].ffill()
#### Check nulls again
df.isnull().sum().sort_values(ascending=False)/df.shape[0]*100
Dropping the remaining nulls that are <1%.
#drop the remaining nulls that are <1%
df = df.dropna()
#last check
null_percentages = df.isnull().sum().sort_values(ascending=False)
null_percentages / df.shape[0] * 100
df.shape
df.info()
#detecting outliers of numerical columns (all floats/ints excluding lat/long and year)
numeric_columns = ['engine_capacity_cc', 'number_of_casualties',
                   'number_of_vehicles', 'speed_limit']
df_num = df[numeric_columns]
df_num.hist(bins=25, grid=False, figsize=(12, 8))
plt.style.use('dark_background')
Column 'speed_limit' seems OK, and 'engine_capacity_cc' was already altered above. However, 'number_of_casualties' and 'number_of_vehicles' will be evaluated.
# #number_of_casualties
# Number of rows for each casualty count.
df['number_of_casualties'].value_counts()
#create casualities grouping
def casualities(num_cas):
    """Bucket a casualty count into "1", "2", "3", "4" or "5+".

    Values below 1 (and NaN) match no bucket and yield None.
    """
    if num_cas >= 5:
        return "5+"
    for lower in (1, 2, 3, 4):
        if lower <= num_cas < lower + 1:
            return str(lower)
    return None
#apply function
df['number_of_casualties'] = df['number_of_casualties'].apply(casualities)
#number_of_casualties
df['number_of_casualties'].value_counts()
df['propulsion_code'].value_counts() / df.shape[0] * 100
#Clean the values for Propulsion Code.
# One mapping from each raw label to the label it is folded into.
propulsion_aliases = {
    "Gas": "Petrol",
    "Gas/Bi-fuel": "Bio-fuel",
    "Petrol/Gas (LPG)": "LPG Petrol",
    "Gas Diesel": "Diesel",
}
df['propulsion_code'] = df['propulsion_code'].replace(propulsion_aliases)
df['propulsion_code'].value_counts() / df.shape[0] * 100
# #unique values
df.nunique().sort_values(ascending=False)
df['date'] = pd.to_datetime(df['date'])
# Month number taken with the vectorised datetime accessor instead of a per-row lambda.
df['month'] = df['date'].dt.month
#creating a weekend feature that includes Friday-Sunday
df['weekend'] = np.where(df['day_of_week'].isin(['Friday', 'Saturday', 'Sunday']), 1, 0)
#create time of day feature with Morning Rush, Day, Noon Rush, Afternoon, After Work Rush, Night
#time of day dictionary
timeofdaygroups = {1: "Morning Rush (6-10)",
                   2: "Day (10-12)",
                   3: "Lunch Rush (12-14)",
                   4: "Afternoon (14-16)",
                   5: "After Work Rush (16-18)",
                   6: "Evening (18-22)",
                   7: "Night (22-6)"}
#pull time data and create hour column
# The first two characters of the time string are taken as the hour and stored as int
# (presumably the strings look like "HH:MM" -- confirm against the data).
df['hour'] = pd.to_numeric(df['time'].str[0:2]).astype('int')
#create time_of_day grouping
def daygroup(hour):
    """Return the time-of-day group code ("1" to "7") for an hour.

    "1": 6-10, "2": 10-12, "3": 12-14, "4": 14-16, "5": 16-18, "6": 18-22;
    every other value falls into "7".
    """
    upper_bounds = ((10, "1"), (12, "2"), (14, "3"), (16, "4"), (18, "5"), (22, "6"))
    if hour >= 6:
        for end_hour, code in upper_bounds:
            if hour < end_hour:
                return code
    return "7"
#apply function
#time of day function
# Store the group code ("1"-"7") that daygroup returns for each row's hour.
df['time_of_day']= df['hour'].apply(daygroup)
# Preview the last 10 rows of the time-related columns.
df[['weekend','day_of_week','time', 'time_of_day']].tail(10)
#vehicle_type
# Each vehicle type's count as a percentage of all rows.
df['vehicle_type'].value_counts()/df.shape[0]*100
I want to condense the vehicle type variables.
# Condensed vehicle groups: each key is the new label, each list holds the raw
# labels that are folded into it.
vehicle_groups = {
    #motorcycles
    "Motorcycle": [
        "Motorcycle over 500cc",
        "Motorcycle over 125cc and up to 500cc",
        "Motorcycle 125cc and under",
        "Motorcycle 50cc and under",
        "Electric motorcycle",
        "Motorcycle - unknown cc",
    ],
    #Goods_vehicle
    "Goods Vehicle": [
        "Van / Goods 3.5 tonnes mgw or under",
        "Goods over 3.5t. and under 7.5t",
        "Goods vehicle - unknown weight",
        "Goods 7.5 tonnes mgw and over",
    ],
    #car
    "Car": ["Taxi/Private hire car"],
    #bus
    "Bus": [
        "Minibus (8 - 16 passenger seats)",
        "Bus or coach (17 or more pass seats)",
    ],
    #other vehicle
    "Other Vehicle": ["Agricultural vehicle", "Other vehicle"],
}
vehicle_type_map = {raw_label: group
                    for group, raw_labels in vehicle_groups.items()
                    for raw_label in raw_labels}
df['vehicle_type'] = df['vehicle_type'].replace(vehicle_type_map)
#vehicle_type
df['vehicle_type'].value_counts() / df.shape[0] * 100
Create more condensed groups for the age band of driver in order to deal with some potential outliers.
#age_band_of_driver
# Each age band's count as a percentage of all rows.
df['age_band_of_driver'].value_counts()/df.shape[0]*100
# Condense the raw age bands into five groups with a single vectorised replace.
# (Looping over every row and re-running a whole-column replace per row gives the
# same labels but does quadratic work and relies on chained in-place mutation.)
age1 = ["0 - 5", "6 - 10", "11 - 15"]
age2 = ["16 - 20","21 - 25"]
age3 = ["26 - 35","36 - 45"]
age4 = ["46 - 55", "56 - 65"]
# "75-100" is kept as an alias of "Over 75" in case the column already holds it.
age5 = ["66 - 75", "75-100", "Over 75"]
age_band_groups = (('Under 16', age1), ('16-25', age2), ('26-45', age3),
                   ('46-65', age4), ('Over 65', age5))
age_band_map = {raw_band: group
                for group, raw_bands in age_band_groups
                for raw_band in raw_bands}
df['age_band_of_driver'] = df['age_band_of_driver'].replace(age_band_map)
#age_band_of_driver
print("Distinct responses for age_band_of_driver:\n", set(df['age_band_of_driver']))
# number_of_vehicles
# Each vehicle count's share of all rows, as a percentage.
df['number_of_vehicles'].value_counts()/df.shape[0]*100
#group number_of_vehicles
def vehicles(num_veh):
    """Bucket a vehicle count into "1", "2", "3" or "4+".

    Values below 1 (and NaN) match no bucket and yield None.
    """
    if num_veh >= 4:
        return "4+"
    for lower in (1, 2, 3):
        if lower <= num_veh < lower + 1:
            return str(lower)
    return None
#apply function
# Replace each raw vehicle count with its group label ("1", "2", "3" or "4+").
df['number_of_vehicles']= df['number_of_vehicles'].apply(vehicles)
# number_of_vehicles
# Each group's share of all rows, as a percentage, then the column's dtype.
df['number_of_vehicles'].value_counts()/df.shape[0]*100
df['number_of_vehicles'].dtypes
# Explicitly store the group labels as object dtype.
df['number_of_vehicles']=df['number_of_vehicles'].astype('object')
#creating seasons column for ML
#creating season column
def getSeason(month):
    """Return the season name for a month number; anything outside Dec-Aug is "fall"."""
    if month in (12, 1, 2):
        return "winter"
    if month in (3, 4, 5):
        return "spring"
    if month in (6, 7, 8):
        return "summer"
    return "fall"
# Derive the season name from each row's month number.
df['season'] = df['month'].apply(getSeason)
# season
# Each season's share of all rows, as a percentage.
df['season'].value_counts()/df.shape[0]*100
#go back to engine capacity CC and create groups
df.engine_capacity_cc.hist()
def enginecap(eng_cc):
    """Label an engine capacity as small (<=1500), medium (<=2000) or large (>2000).

    A value that satisfies none of the comparisons (e.g. NaN) yields None.
    """
    if eng_cc <= 1500:
        return "small engine cc"
    if eng_cc <= 2000:
        return "medium engine cc"
    if eng_cc > 2000:
        return "large engine cc"
    return None
df['engine_capacity_cc_size'] = df['engine_capacity_cc'].apply(enginecap)
df.engine_capacity_cc_size.value_counts()
#Put above pickle in next full run
#create new column for Machine Learning and Visualization with Not Serious and Serious
# Two-level target: "Slight" becomes "Not Serious"; "Serious" and "Fatal" are both "Serious".
severity_to_seriousness = {
    "Slight": "Not Serious",
    "Serious": "Serious",
    "Fatal": "Serious",
}
df['accident_seriousness'] = df['accident_severity'].replace(severity_to_seriousness)
df.shape
df.accident_seriousness.value_counts()
#pickling everything to speed up restarting
df.to_pickle("df.pkl")
#import pickled file
# NOTE: only unpickle files you created yourself; unpickling untrusted data can execute arbitrary code.
df = pd.read_pickle("df.pkl")
df.head()
# Number of accident records per year.
accidentsperyear = df.groupby(['year'])['accident_index'].count()
# prepare plot
plt.style.use('dark_background')
plt.figure(figsize=(10,5))
# One colour per bar, so the palette never wraps around.
colors = sns.color_palette("plasma", n_colors=len(accidentsperyear))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=accidentsperyear.index, y=accidentsperyear.values, palette=colors)
sns.despine(top=True, right=True, left=True, bottom=True)
plt.title("Accidents Per Year",fontsize=20,fontweight="bold")
plt.xlabel("\nYear", fontsize=15, fontweight="bold")
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
# Apply the layout before saving so the saved image matches what is displayed.
plt.tight_layout()
plt.savefig('accidentsperyear.png')
# Number of accident records per calendar month.
accidentspermonth = df.groupby(['month'])['accident_index'].count()
# prepare plot
plt.style.use('dark_background')
plt.figure(figsize=(20,10))
colors = sns.color_palette("plasma_r", n_colors=12)
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
mt = sns.barplot(x=accidentspermonth.index, y=accidentspermonth.values, palette=colors)
sns.despine(top=True, right=True, left=True, bottom=True)
# mt is the axes instance; relabel the month numbers with month names
group_labels = ['Jan', 'Feb','Mar','Apr','May','June','July','Aug','Sept','Oct','Nov','Dec' ]
mt.set_xticklabels(group_labels)
plt.title("Accidents Per Month",fontsize=20,fontweight="bold")
plt.xticks(fontsize=18)
plt.yticks(fontsize=12)
plt.xlabel("\nMonth", fontsize=15, fontweight="bold")
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
# Apply the layout before saving so the saved image matches what is displayed.
plt.tight_layout()
plt.savefig('accidentspermonth.png')
# Year x weekday table of accident counts, with the weekday columns in calendar order.
weekdays = ['Monday', 'Tuesday','Wednesday','Thursday', 'Friday', 'Saturday', 'Sunday']
counts_by_year_day = df.groupby(['year', 'day_of_week']).size()
counts_by_year_day = counts_by_year_day.rename_axis(['year', 'day_of_week'])
accweekday = counts_by_year_day.unstack('day_of_week').reindex(columns=weekdays)

plt.figure(figsize=(15, 10))
plt.style.use('dark_background')
sns.heatmap(accweekday, cmap='plasma_r')
plt.title('\nAccidents by Weekday per Year\n', fontsize=14, fontweight='bold')
plt.xticks(fontsize=15)
plt.yticks(fontsize=12)
# Both axis labels are blanked.
plt.xlabel('')
plt.ylabel('')
plt.savefig('accidentsbyweekdayperyear.png')
plt.show()
Fridays are the day of the week where the most accidents occur.
# Number of accident records per season.
accidentsperseason = df.groupby(['season'])['accident_index'].count()
seaord=['spring', 'summer', 'fall','winter']
# prepare plot
plt.style.use('dark_background')
plt.figure(figsize=(15,10))
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=accidentsperseason.index, y=accidentsperseason.values, order=seaord,
            saturation=1, palette='magma_r')
sns.despine(top=True, right=True, left=True, bottom=True)
plt.title("Accidents Per Season",fontsize=20,fontweight="bold")
plt.xticks(fontsize=15)
plt.yticks(fontsize=12)
plt.xlabel("\nSeason", fontsize=15, fontweight="bold")
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
plt.tight_layout()
plt.savefig('accidentsperseason.png')
#"Morning Rush (6-10)", "Day (10-12)", "Lunch Rush (12-14)","Afternoon (14-16)",
#"After Work Rush (16-18)", "Evening (18-22)", "Night (22-6)"
# Translate the group codes "1"-"7" into readable names.
# NOTE: re-running this cell maps the already-translated names to NaN, because the
# dictionary is keyed by the codes only.
timeofdaygroups = {'1': "Morning Rush",
                   '2': "Day",
                   '3': "Lunch Rush",
                   '4': "Afternoon",
                   '5': "After Work Rush",
                   '6': "Evening",
                   '7': "Night"}
df['time_of_day']=df['time_of_day'].map(timeofdaygroups)
accidentspertod = df.groupby(['time_of_day'])['accident_index'].count()
# prepare plot
plt.style.use('dark_background')
plt.figure(figsize=(15,10))
tod=["Morning Rush", "Day", "Lunch Rush", "Afternoon",
     "After Work Rush", "Evening", "Night"]
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=accidentspertod.index, y=accidentspertod.values, order=tod, palette='rainbow')
sns.despine(top=True, right=True, left=True, bottom=True)
plt.title("Accidents Per Time of Day",fontsize=20,fontweight="bold")
plt.xticks(fontsize=15)
plt.yticks(fontsize=12)
plt.xlabel("", fontsize=15, fontweight="bold")
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
plt.tight_layout()
plt.savefig('accidentspertod.png')
%%HTML
<div class='tableauPlaceholder' id='viz1572176706313' style='position: relative'><noscript><a href='https://github.com/GenTaylor/Traffic-Accident-Analysis'><img alt=' ' src='https://public.tableau.com/static/images/Ac/AccidentForecasting/AccidentForecasting/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='AccidentForecasting/AccidentForecasting' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/Ac/AccidentForecasting/AccidentForecasting/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1572176706313'); var vizElement = divElement.getElementsByTagName('object')[0]; vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px'; var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>
According to the forecasting above, traffic accidents will be slightly lower than in previous years but will follow similar trends throughout the months.
For correlation I used both Pearson and Spearman just in case there would be discrepancies. The order may have slightly varied but the "highest" correlated remained the same.
#correlation by accident severity pearson
# Label-encode every column, standardise, then correlate each column with the target.
corrdf=df.apply(LabelEncoder().fit_transform)
sc = StandardScaler()
corrdf = sc.fit_transform(corrdf)
corrdf=pd.DataFrame(data=corrdf,columns=df.columns)
corr=corrdf.corr()['accident_seriousness']
# sort_values orders by correlation (highest first) and places NaN last; indexing the
# Series with raw argsort positions depends on positional [] lookup and mishandles NaN.
corr.sort_values(ascending=False)
corr_spear=corrdf.corr(method='spearman')['accident_seriousness']
corr_spear.sort_values(ascending=False)
Looking at this I wanted to visualize some of the higher pos/negative correlations against accident severity.
Before these visualizations were done, I wanted to be sure that the visualizations were of some importance to accident_seriousness. For this, the chi-squared test was used.
"""chisquare algorithm from
http://www.insightsbot.com/blog/2AeuRL/chi-square-feature-selection-in-python """
class ChiSquare:
def __init__(self, dataframe):
self.df = dataframe
self.p = None #P-Value
self.chi2 = None #Chi Test Statistic
self.dof = None
self.dfObserved = None
self.dfExpected = None
def _print_chisquare_result(self, colX, alpha):
result = ""
if self.p<alpha:
result="The column {0} is IMPORTANT for Prediction".format(colX)
else:
result="The column {0} is NOT an important predictor. (Discard {0} from model)".format(colX)
print(result)
def TestIndependence(self,colX,colY, alpha=0.05):
X = self.df[colX].astype(str)
Y = self.df[colY].astype(str)
self.dfObserved = pd.crosstab(Y,X)
chi2, p, dof, expected = stats.chi2_contingency(self.dfObserved.values)
self.p = p
self.chi2 = chi2
self.dof = dof
self.dfExpected = pd.DataFrame(expected, columns=self.dfObserved.columns,
index = self.dfObserved.index)
self._print_chisquare_result(colX,alpha)
#Initialize ChiSquare Class
cT = ChiSquare(df)
#Feature Selection
# Every column listed here is tested for independence from accident_seriousness.
testColumns = [
    'accident_index', '1st_road_class', '1st_road_number', '2nd_road_number',
    'carriageway_hazards', 'date', 'day_of_week',
    'did_police_officer_attend_scene_of_accident', 'junction_control',
    'junction_detail', 'latitude', 'light_conditions', 'local_authority_district',
    'local_authority_highway', 'longitude', 'lsoa_of_accident_location',
    'number_of_casualties', 'number_of_vehicles', 'pedestrian_crossing-human_control',
    'pedestrian_crossing-physical_facilities', 'police_force', 'road_surface_conditions',
    'road_type', 'special_conditions_at_site', 'speed_limit', 'time',
    'urban_or_rural_area', 'weather_conditions', 'year', 'inscotland',
    'age_band_of_driver', 'age_of_vehicle', 'driver_home_area_type',
    'driver_imd_decile', 'engine_capacity_cc', 'hit_object_in_carriageway',
    'hit_object_off_carriageway', 'journey_purpose_of_driver', 'junction_location',
    'make', 'model', 'propulsion_code', 'sex_of_driver', 'skidding_and_overturning',
    'towing_and_articulation', 'vehicle_leaving_carriageway',
    'vehicle_locationrestricted_lane', 'vehicle_manoeuvre', 'vehicle_reference',
    'vehicle_type', 'was_vehicle_left_hand_drive', 'x1st_point_of_impact', 'month',
    'weekend', 'hour', 'time_of_day', 'season', 'engine_capacity_cc_size',
]
for var in testColumns:
    cT.TestIndependence(colX=var, colY="accident_seriousness")
For my visualizations I have decided to use some of the features with the highest correlations to accident_seriousness:
Note: The columns used were selected because of the absolute value of their correlation in relation to accident_seriousness
*columns added after correlation was done after undersampling
For visual reasons, two separate dataframes were created, for not serious and serious accidents. I wanted to better scale the data and for me, this was the simplest way of doing so.
#dataframe where accidents are Slight
not_serious = df[(df['accident_seriousness']=="Not Serious")]
print("Not Serious Group Shape:", not_serious.shape)
not_serious.accident_seriousness.value_counts()
#dataframe where accidents are serious
serious= df[(df['accident_seriousness']=="Serious")]
print("Serious Group Shape:", serious.shape)
serious.accident_seriousness.value_counts()
#map 1, 2, 3 in did_police_officer_attend_scene_of_accident with Yes, No,Self-reported
policeattend = {1: "Yes", 2:"No", 3:"Self-Reported"}
not_serious['did_police_officer_attend_scene_of_accident']=not_serious['did_police_officer_attend_scene_of_accident'].map(policeattend)
df['did_police_officer_attend_scene_of_accident']=df['did_police_officer_attend_scene_of_accident'].map(policeattend)
serious['did_police_officer_attend_scene_of_accident']=serious['did_police_officer_attend_scene_of_accident'].map(policeattend)
imddecile = {1:"Most deprived 10%", 2:"More deprived 10-20%", 3:"More deprived 20-30%",
4:"More deprived 30-40%", 5:"More deprived 40-50%", 6:"Less deprived 40-50%",
7:"Less deprived 30-40%", 8:"Less deprived 20-30%", 9:"Less deprived 10-20%",
10:"Least deprived 10%"}
not_serious['driver_imd_decile']=not_serious['driver_imd_decile'].map(imddecile)
df['driver_imd_decile']=df['driver_imd_decile'].map(imddecile)
serious['driver_imd_decile']=serious['driver_imd_decile'].map(imddecile)
# Row counts (as floats) used as denominators for the percentage labels
# drawn on the bars of the plots below.
dftotal, nstotal, setotal = (float(len(frame)) for frame in (df, not_serious, serious))
The below plots will look into if police officers attended the scene of an accident.
# Police attendance, one panel per seriousness group (left: not serious,
# right: serious). Bars are labelled with their share of the group's rows.
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
sns.despine(top=True, right=True, left=False, bottom=False)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="did_police_officer_attend_scene_of_accident", hue="accident_seriousness",
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="did_police_officer_attend_scene_of_accident", hue="accident_seriousness",
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 3,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Did Police Officer Attend Scene Of Accident", fontsize=20, fontweight="bold")
ax1.set_xlabel('Attendance of Not Serious Accidents', fontsize=12, fontweight="bold")
ax2.set_xlabel('Attendance of Serious Accidents', fontsize=12, fontweight="bold")
ax1.set_ylabel('Number Attended')
ax2.set_ylabel('Number Attended')
# Each panel holds a single hue level, so the legends carry no information.
ax1.get_legend().remove()
ax2.get_legend().remove()
plt.style.use('dark_background')
plt.savefig('did_police_officer_attend_scene_of_accident.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The below plots show the counts for the first spot in which vehicles were hit in an accident
# First point of impact vs accident seriousness
# (top panel: not serious, bottom panel: serious).
fpoa_order = ["Front", "Nearside", "Did not impact", "Back", "Offside"]
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))
sns.despine(top=True, right=True, left=False, bottom=False)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="x1st_point_of_impact", hue_order=fpoa_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="x1st_point_of_impact", hue_order=fpoa_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 3,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("First Point of Impact in Accidents", fontsize=20, fontweight="bold")
ax1.set_xlabel('First Point of Impact of Not Serious Accidents', fontsize=15, fontweight="bold")
ax2.set_xlabel('First Point of Impact of Serious Accidents', fontsize=15, fontweight="bold")
ax1.set_ylabel('First Point of Impact Count', fontsize=15, fontweight="bold")
ax2.set_ylabel('')
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.style.use('dark_background')
plt.savefig('x1st_point_of_impact.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The below plots show the counts for number of vehicles in each accident.
# Number of vehicles per accident vs accident seriousness
# (top panel: not serious, bottom panel: serious).
nov_order = ["1", "2", "3", "4+"]
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))
sns.despine(top=True, right=True, left=False, bottom=False)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="number_of_vehicles", hue_order=nov_order,
                    palette="GnBu_d", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="number_of_vehicles", hue_order=nov_order,
                    palette="GnBu_d", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 3,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Number of Vehicles in Accidents", fontsize=20, fontweight="bold")
ax1.set_xlabel('Number of Vehicles of Not Serious Accidents', fontsize=15, fontweight="bold")
ax2.set_xlabel('Number of Vehicles of Serious Accidents', fontsize=15, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.set_ylabel('')
plt.style.use('dark_background')
plt.savefig('number_of_vehicles.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The graphs below show the speed limits in the areas where the accidents occurred (Speed Limit vs Accident Seriousness).
# Speed limit vs accident seriousness
# (top panel: not serious, bottom panel: serious).
# The serious panel uses its own hue order without the 15 level.
splt_order = [15, 20, 30, 40, 50, 60, 70]
splt_order2 = [20, 30, 40, 50, 60, 70]
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="speed_limit", hue_order=splt_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="speed_limit", hue_order=splt_order2,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar; the not-serious panel prints four decimals.
for axis, group_total, pct_fmt in ((ax1, nstotal, '{:1.4f}%'),
                                   (ax2, setotal, '{:1.2f}%')):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  pct_fmt.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Speed Limit in Accidents", fontsize=20, fontweight="bold")
ax1.set_xlabel('Speed Limit of Not Serious Accidents', fontsize=15, fontweight="bold")
ax2.set_xlabel('Speed Limit of Serious Accidents', fontsize=15, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
plt.style.use('dark_background')
plt.savefig('speed_limit.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The graphs below show whether the accidents occurred in an Urban or Rural Area.
# Urban vs rural area, one panel per seriousness group
# (left: not serious, right: serious).
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
sns.despine(top=True, right=True, left=False, bottom=False)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="urban_or_rural_area",
                    palette="PuBu", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="urban_or_rural_area",
                    palette="PuBu", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 3,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Urban or Rural Areas vs Accidents", fontsize=20, fontweight="bold")
ax1.set_xlabel('\nUrban or Rural Areas vs Not Serious Accidents', fontsize=14, fontweight="bold")
ax2.set_xlabel('\nUrban or Rural Areas vs Serious Accidents', fontsize=14, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.set_ylabel('')
plt.style.use('dark_background')
plt.savefig('urban_or_rural_area.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The graphs below show whether any skidding, jackknifing, and/or overturning occurred in the accident.
# Skidding / overturning vs accident seriousness
# (top panel: not serious, bottom panel: serious).
sao_order = ["None", "Skidded", "Skidded and overturned", "Overturned", "Jackknifed",
             "Jackknifed and overturned"]
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="skidding_and_overturning", hue_order=sao_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="skidding_and_overturning", hue_order=sao_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label (three decimals) above each bar.
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  '{:1.3f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Skidding and Overturning in Accidents", fontsize=20, fontweight="bold")
ax1.set_xlabel('\nSkidding and Overturning in Not Serious Accidents', fontsize=14, fontweight="bold")
ax2.set_xlabel('\nSkidding and Overturning in Serious Accidents', fontsize=14, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
plt.style.use('dark_background')
plt.savefig('skidding_and_overturning.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The below graphs show if a vehicle left the carriageway, and if they did, where did they do so.
# Vehicle leaving carriageway vs accident seriousness
# (top panel: not serious, bottom panel: serious).
vlc_order = ["Did not leave carriageway", "Straight ahead at junction", "Nearside",
             "Offside", "Offside on to central reservation", "Nearside and rebounded",
             "Offside - crossed central reservation", "Offside and rebounded",
             "Offside on to centrl res + rebounded"]
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="vehicle_leaving_carriageway", hue_order=vlc_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="vehicle_leaving_carriageway", hue_order=vlc_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Vehicle Leaving Carriageway in Accidents", fontsize=18, fontweight="bold")
ax1.set_xlabel('Not Serious Accidents\n\n', fontsize=13, fontweight="bold")
ax2.set_xlabel('Serious Accidents', fontsize=13, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
ax2.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.style.use('dark_background')
plt.savefig('vehicle_leaving_carriageway.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The below graphs show the sex of the drivers in the accidents.
# Sex of driver, one panel per seriousness group
# (left: not serious, right: serious).
sod_order = ["Female", "Male", "Not known"]
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
sns.despine(top=True, right=True, left=False, bottom=False)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="sex_of_driver", hue_order=sod_order,
                    palette="magma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="sex_of_driver", hue_order=sod_order,
                    palette="magma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 3,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Sex of Driver in Accidents", fontsize=20, fontweight="bold")
ax1.set_xlabel('\nSex of Driver in Not Serious Accidents', fontsize=14, fontweight="bold")
ax2.set_xlabel('\nSex of Driver in Serious Accidents', fontsize=14, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.set_ylabel('')
plt.style.use('dark_background')
plt.savefig('sex_of_driver.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The graphs below are about the number of accidents by type of vehicle.
# Vehicle type vs accident seriousness
# (top panel: not serious, bottom panel: serious).
vt_order = ['Bus', 'Car', 'Goods Vehicle', 'Motorcycle', 'Other Vehicle']
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="vehicle_type", hue_order=vt_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="vehicle_type", hue_order=vt_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Vehicle Type in Accidents", fontsize=18, fontweight="bold")
ax1.set_xlabel('Vehicle Type in Not Serious Accidents\n\n', fontsize=13, fontweight="bold")
ax2.set_xlabel('Vehicle Type in Serious Accidents', fontsize=13, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
ax2.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.style.use('dark_background')
plt.savefig('vehicle_type.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The graphs below depict the types of moves vehicles made that led to the accident.
# Vehicle manoeuvre vs accident seriousness
# (top panel: not serious, bottom panel: serious).
vm_order = ['Turning right', 'Going ahead other', 'Going ahead right-hand bend',
            'Slowing or stopping', 'Turning left', 'Waiting to go - held up',
            'Waiting to turn right', 'Overtaking static vehicle - offside',
            'Parked', 'Overtaking - nearside', 'U-turn', 'Changing lane to right',
            'Reversing', 'Waiting to turn left', 'Changing lane to left',
            'Going ahead left-hand bend', 'Overtaking moving vehicle - offside', 'Moving off']
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="vehicle_manoeuvre", hue_order=vm_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="vehicle_manoeuvre", hue_order=vm_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Vehicle Manuevers in Accidents", fontsize=18, fontweight="bold")
ax1.set_xlabel('Vehicle Manuevers in Not Serious Accidents\n\n', fontsize=13.5, fontweight="bold")
ax2.set_xlabel('Vehicle Manuevers in Serious Accidents', fontsize=13.5, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
ax2.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.tight_layout()  # No overlap of subplots
plt.style.use('dark_background')
plt.savefig('vehicle_manoeuvre.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
These graphs take another look at area type, this time the driver's home area type (Urban, Rural, or Small Town), in relation to the accidents that occurred.
# Driver home area type, one panel per seriousness group
# (left: not serious, right: serious).
dhoa_order = ['Urban area', 'Rural', 'Small town']
fig, ax = plt.subplots(1, 2, figsize=(15, 8))
sns.despine(top=True, right=True, left=False, bottom=False)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="driver_home_area_type", hue_order=dhoa_order,
                    palette="magma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="driver_home_area_type", hue_order=dhoa_order,
                    palette="magma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 3,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Driver Home Type Area in Accidents", fontsize=20, fontweight="bold")
ax1.set_xlabel('\nDriver Home Type Area in Not Serious Accidents', fontsize=14, fontweight="bold")
ax2.set_xlabel('\nDriver Home Type Area in Serious Accidents', fontsize=14, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.set_ylabel('')
plt.style.use('dark_background')
plt.savefig('driver_home_area.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The graphs below show accidents by age groups of the drivers.
# Age band of driver vs accident seriousness
# (top panel: not serious, bottom panel: serious).
abod_order = ['Under 16', '16-25', '26-45', '46-65', 'Over 65']
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="age_band_of_driver", hue_order=abod_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="age_band_of_driver", hue_order=abod_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Age Band of Driver in Accidents", fontsize=18, fontweight="bold")
ax1.set_xlabel('Age Band of Driver in Not Serious Accidents\n', fontsize=13.5, fontweight="bold")
ax2.set_xlabel('Age Band of Driver in Serious Accidents', fontsize=13.5, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
ax2.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.style.use('dark_background')
plt.savefig('age_band_of_driver.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The following graphs show what type of traffic signs or signals were up in the accident area, if any.
# Junction control vs accident seriousness
# (top panel: not serious, bottom panel: serious).
jc_order = ['Give way or uncontrolled', 'Auto traffic signal', 'Authorised person',
            'Stop sign', 'Not at junction or within 20 metres']
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="junction_control", hue_order=jc_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="junction_control", hue_order=jc_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Junction Control in Accidents", fontsize=18, fontweight="bold")
ax1.set_xlabel('Junction Control in Not Serious Accidents\n', fontsize=13.5, fontweight="bold")
ax2.set_xlabel('Junction Control in Serious Accidents', fontsize=13.5, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
ax2.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.style.use('dark_background')
plt.savefig('junction_control.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The following graphs show if a vehicle hit an object off of the road and what object, if they hit one during the accident.
# Object hit off the carriageway vs accident seriousness
# (top panel: not serious, bottom panel: serious).
hooffc_order = ['None', 'Lamp post', 'Road sign or traffic signal', 'Other permanent object',
                'Entered ditch', 'Tree', 'Near/Offside crash barrier', 'Central crash barrier',
                'Bus stop or bus shelter', 'Telegraph or electricity pole', 'Submerged in water',
                'Wall or fence']
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="hit_object_off_carriageway", hue_order=hooffc_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="hit_object_off_carriageway", hue_order=hooffc_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar; the not-serious panel prints three decimals.
for axis, group_total, pct_fmt in ((ax1, nstotal, '{:1.3f}%'),
                                   (ax2, setotal, '{:1.2f}%')):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  pct_fmt.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Objects Hit Off Carriageway in Accidents", fontsize=18, fontweight="bold")
ax1.set_xlabel('Objects Hit Off Carriageway in Not Serious Accidents\n', fontsize=13.5, fontweight="bold")
ax2.set_xlabel('Objects Hit Off Carriageway in Serious Accidents', fontsize=13.5, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
ax2.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.style.use('dark_background')
plt.savefig('hit_object_off_carriageway.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The following graphs show if a vehicle hit an object on the road and what object, if they hit one during the accident.
# Object hit in the carriageway vs accident seriousness
# (top panel: not serious, bottom panel: serious).
hoinc_order = ['None', 'Kerb', 'Other object', 'Bollard or refuge', 'Parked vehicle',
               'Road works', 'Open door of vehicle', 'Central island of roundabout',
               'Previous accident', 'Bridge (side)', 'Any animal (except ridden horse)',
               'Bridge (roof)']
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 12))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="hit_object_in_carriageway", hue_order=hoinc_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="hit_object_in_carriageway", hue_order=hoinc_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label (three decimals) above each bar.
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  '{:1.3f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Objects Hit in Carriageway in Accidents", fontsize=18, fontweight="bold")
ax1.set_xlabel('Objects Hit in Carriageway in Not Serious Accidents\n', fontsize=13.5, fontweight="bold")
ax2.set_xlabel('Objects Hit in Carriageway in Serious Accidents', fontsize=13.5, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
ax2.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.style.use('dark_background')
plt.savefig('hit_object_in_carriageway.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The Driver IMD Decile is the score for the deprivation of an area. The graphs below show accidents by how deprived an area was at the time of the accident.
# Driver IMD decile vs accident seriousness
# (top panel: not serious, bottom panel: serious).
imd_order = ["Least deprived 10%", "Less deprived 10-20%", "Less deprived 20-30%",
             "Less deprived 30-40%", "Less deprived 40-50%", "Most deprived 10%",
             "More deprived 10-20%", "More deprived 20-30%", "More deprived 30-40%",
             "More deprived 40-50%"]
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 15))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="driver_imd_decile", hue_order=imd_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="driver_imd_decile", hue_order=imd_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Driver Area Deprivation Scores in Accidents", fontsize=18, fontweight="bold")
ax1.set_xlabel('Driver Area Deprivation Scores in Not Serious Accidents\n', fontsize=13.5, fontweight="bold")
ax2.set_xlabel('Driver Area Deprivation Scores in Serious Accidents', fontsize=13.5, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
ax2.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.style.use('dark_background')
plt.savefig('driver_imd_decile.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The following graphs show the road features in relation to where the accidents occurred.
# Junction detail vs accident seriousness
# (top panel: not serious, bottom panel: serious).
jud_order = ['T or staggered junction', 'Mini-roundabout', 'Crossroads',
             'Private drive or entrance', 'More than 4 arms (not roundabout)',
             'Roundabout', 'Slip road', 'Other junction', 'Not at junction or within 20 metres']
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 15))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="junction_detail", hue_order=jud_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="junction_detail", hue_order=jud_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Junction Details in Accidents", fontsize=18, fontweight="bold")
ax1.set_xlabel('Junction Details in Not Serious Accidents\n', fontsize=13.5, fontweight="bold")
ax2.set_xlabel('Junction Details in Serious Accidents', fontsize=13.5, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
ax2.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.style.use('dark_background')
plt.savefig('junction_detail.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The graphs below show where the accidents occurred on the roads.
# Junction location vs accident seriousness
# (top panel: not serious, bottom panel: serious).
jul_order = ['Mid Junction - on roundabout or on main road', 'Entering main road',
             'Approaching junction or waiting/parked at junction approach',
             'Cleared junction or waiting/parked at junction exit', 'Leaving main road',
             'Leaving roundabout', 'Entering roundabout', 'Entering from slip road',
             'Not at or within 20 metres of junction']
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 15))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="junction_location", hue_order=jul_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="junction_location", hue_order=jul_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Junction Locations in Accidents", fontsize=18, fontweight="bold")
ax1.set_xlabel('Junction Locations in Not Serious Accidents\n', fontsize=13.5, fontweight="bold")
ax2.set_xlabel('Junction Locations in Serious Accidents', fontsize=13.5, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
ax2.set_ylabel('Number of Accidents', fontsize=15, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.style.use('dark_background')
plt.savefig('junction_location.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The propulsion code is the type of fuel used to power the car. The graphs below show what type of fuel was used in the vehicles in the accidents.
# Propulsion code vs accident seriousness
# (top panel: not serious, bottom panel: serious).
# The serious panel uses a shorter hue order (pd_order2).
pd_order = ['Petrol', 'Heavy oil', 'Hybrid electric', 'Bio-fuel', 'LPG Petrol', 'Diesel',
            'Fuel cells', 'New fuel technology', 'Electric diesel']
pd_order2 = ['Petrol', 'Heavy oil', 'Hybrid electric', 'Bio-fuel', 'LPG Petrol', 'Electric diesel']
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 15))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="propulsion_code", hue_order=pd_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="propulsion_code", hue_order=pd_order2,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Propulsion Codes in Accidents", fontsize=18, fontweight="bold")
ax1.set_xlabel('Propulsion Codes in Not Serious Accidents\n', fontsize=13.5, fontweight="bold")
ax2.set_xlabel('Propulsion Codes in Serious Accidents', fontsize=13.5, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=13.5, fontweight="bold")
ax2.set_ylabel('Number of Accidents', fontsize=13.5, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.style.use('dark_background')
plt.savefig('propulsion_code.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
The year of the accidents.
# Accident year vs accident seriousness
# (top panel: not serious, bottom panel: serious).
year_order = [2010, 2011, 2012, 2013, 2014, 2015, 2016]
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(15, 15))
sns.despine(top=False, right=True, left=True)

# The column is passed as x=...: seaborn >= 0.12 no longer accepts it positionally.
ax1 = sns.countplot(x="accident_seriousness", hue="year", hue_order=year_order,
                    palette="plasma", data=not_serious, ax=ax[0])
ax2 = sns.countplot(x="accident_seriousness", hue="year", hue_order=year_order,
                    palette="plasma", data=serious, ax=ax[1])

# Percentage label above each bar (nstotal / setotal are the group row counts).
for axis, group_total in ((ax1, nstotal), (ax2, setotal)):
    for bar in axis.patches:
        bar_height = bar.get_height()
        axis.text(bar.get_x() + bar.get_width() / 2.,
                  bar_height + 4,
                  '{:1.2f}%'.format(bar_height / group_total * 100),
                  ha="center", fontsize=12)

fig.suptitle("Accidents by Year", fontsize=18, fontweight="bold")
ax1.set_xlabel('Not Serious Accidents by Year\n', fontsize=13.5, fontweight="bold")
ax2.set_xlabel('Serious Accidents by Year', fontsize=13.5, fontweight="bold")
ax1.set_ylabel('Number of Accidents', fontsize=13.5, fontweight="bold")
ax2.set_ylabel('Number of Accidents', fontsize=13.5, fontweight="bold")
# The x axis has a single category per panel, so its ticks and labels are hidden.
ax1.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax2.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.style.use('dark_background')
plt.savefig('year.png')
# plt.show() instead of fig.show(): the latter warns under the inline backend.
plt.show()
Due to the previous visualizations a comparison of certain variables was desired to see more correlations.
The following graph shows what type of traffic control was in place in specific areas of the road where accidents occurred.
# All accidents in df: junction control type, broken down by junction detail.
plt.style.use('dark_background')
plt.figure(figsize=(20, 15))
# x is passed by keyword; seaborn >= 0.12 treats the first positional argument as `data`.
ax = sns.countplot(x="junction_control", hue="junction_detail",
                   palette="plasma", data=df)
plt.title("Junction Control by Junction Detail", fontsize=25, fontweight="bold")
# The x axis shows junction_control categories (the old label said "Accident by Year").
plt.xlabel("\nJunction Control", fontsize=15, fontweight="bold")
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# Every plt.legend() call replaces the previous legend, so only this final
# configuration ever took effect; the two earlier calls were dead code.
plt.legend(fontsize=15, bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
sns.despine(top=True, right=True, left=True, bottom=False)
plt.savefig('junction_control_by_junction_detail.png')
plt.show()
The graph below is a more detailed look at junction areas in relation to the accidents.
# All accidents in df: junction control type, broken down by junction location.
plt.style.use('dark_background')
plt.figure(figsize=(20, 15))
# x is passed by keyword; seaborn >= 0.12 treats the first positional argument as `data`.
ax = sns.countplot(x="junction_control", hue="junction_location",
                   palette="plasma", data=df)
plt.title("Junction Control by Junction Location in Accidents", fontsize=25, fontweight="bold")
# The x axis shows junction_control categories (the old label said "Accident by Year").
plt.xlabel("\nJunction Control", fontsize=15, fontweight="bold")
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# Every plt.legend() call replaces the previous legend, so only this final
# configuration ever took effect; the two earlier calls were dead code.
plt.legend(fontsize=15, bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
sns.despine(top=True, right=True, left=True, bottom=False)
plt.savefig('junction_control_by_junction_location.png')
plt.show()
The graph below shows where impact first occurred, broken down by the detailed road area type.
# All accidents in df: first point of impact, broken down by junction detail.
plt.style.use('dark_background')
plt.figure(figsize=(20, 15))
# x is passed by keyword; seaborn >= 0.12 treats the first positional argument as `data`.
ax = sns.countplot(x="x1st_point_of_impact", hue="junction_detail",
                   palette="plasma", data=df)
plt.title("First point of Impact by Junction Detail", fontsize=25, fontweight="bold")
# The x axis shows x1st_point_of_impact categories (the old label said "Accident by Year").
plt.xlabel("\nFirst Point of Impact", fontsize=15, fontweight="bold")
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# Every plt.legend() call replaces the previous legend, so only this final
# configuration ever took effect; the two earlier calls were dead code.
plt.legend(fontsize=15, bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
sns.despine(top=True, right=True, left=True, bottom=False)
plt.savefig('x1st_point_of_impact_by_junction_detail.png')
plt.show()
The graph below shows where the accident occurred and what the first point of impact was.
# All accidents in df: first point of impact, broken down by junction location.
plt.style.use('dark_background')
plt.figure(figsize=(20, 15))
# x is passed by keyword; seaborn >= 0.12 treats the first positional argument as `data`.
ax = sns.countplot(x="x1st_point_of_impact", hue="junction_location",
                   palette="plasma", data=df)
plt.title("First point of Impact by Junction Location", fontsize=25, fontweight="bold")
# The x axis shows x1st_point_of_impact categories (the old label said "Accident by Year").
plt.xlabel("\nFirst Point of Impact", fontsize=15, fontweight="bold")
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# Every plt.legend() call replaces the previous legend, so only this final
# configuration ever took effect; the two earlier calls were dead code.
plt.legend(fontsize=15, bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
sns.despine(top=True, right=True, left=True, bottom=False)
plt.savefig('x1st_point_of_impact_by_junction_location.png')
plt.show()
The following graph shows what types of traffic control (signage or otherwise) were present at the first point of impact.
# All accidents in df: first point of impact, broken down by junction control.
plt.style.use('dark_background')
plt.figure(figsize=(20, 15))
# x is passed by keyword; seaborn >= 0.12 treats the first positional argument as `data`.
ax = sns.countplot(x="x1st_point_of_impact", hue="junction_control",
                   palette="plasma", data=df)
plt.title("First point of Impact by Junction Control", fontsize=25, fontweight="bold")
# The x axis shows x1st_point_of_impact categories (the old label said "Accident by Year").
plt.xlabel("\nFirst Point of Impact", fontsize=15, fontweight="bold")
plt.ylabel("\nNumber of Accidents", fontsize=15, fontweight="bold")
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
# Every plt.legend() call replaces the previous legend, so only this final
# configuration ever took effect; the two earlier calls were dead code.
plt.legend(fontsize=15, bbox_to_anchor=(1.04, 1), loc='upper right', ncol=1)
sns.despine(top=True, right=True, left=True, bottom=False)
plt.savefig('x1st_point_of_impact_by_junction_control.png')
plt.show()
In every comparison above, most accidents occurred in areas that were uncontrolled. One of the main ones was the junction detail "T or staggered junction".
Other areas of concern include mid junctions on roundabouts or main roads, and areas approaching a junction where cars were either parked or waiting in the junction.
From the data above, more controlled areas would be beneficial. Signs alerting drivers to upcoming junctions, traffic lights, or stop signs might help in some of these areas where they are feasible.
For example, this is a staggered junction, the main junction detail in accidents. One can understand how situations such as this can lead to numerous accidents, especially if proper signage is not available. Perhaps traffic lights, stop signs, or warnings indicating that drivers are approaching certain junctions would help reduce accidents.
Below you will find a web scrape of the Learner Driving Centres website, which contains information on road signs in the UK. The signs were pulled to show examples of signage that could be placed.
# Request the road-signs page. The timeout stops a stalled connection from
# hanging the notebook, and raise_for_status() surfaces HTTP errors instead of
# letting an error page be parsed as if it were the sign listing.
r = requests.get('https://www.learnerdriving.com/learn-to-drive/highway-code/road-signs',
                 timeout=30)
r.raise_for_status()
# Parse the HTML
soup = BeautifulSoup(r.text, 'html.parser')
# Keep only the <div class="fifth"> elements
results = soup.find_all('div', attrs={'class': 'fifth'})
# Inspect the first match to see how a single entry is structured
first_result = results[0]
first_result
first_result.find('img')['src']
# For every scraped entry, pair the image's src attribute with the entry's
# second child node (contents[1]), which is used as the sign description.
signage = [(entry.find('img')['src'], entry.contents[1]) for entry in results]
# Put the pulled UK traffic signs into a dataframe
sign_columns = ['Sign', 'Sign Description']
uktrafficsigns = pd.DataFrame(signage, columns=sign_columns)
uktrafficsigns.head()
# The scraped "Sign" value is only part of the image link; the site root is
# prepended so that each entry becomes a full image URL.
uktrafficsigns['Sign'] = uktrafficsigns['Sign'].radd('https://www.learnerdriving.com/')
uktrafficsigns.head()
# One description (at index 42) was blank without reading as null, so it is
# filled in by hand here, before the table is used any further.
uktrafficsigns.loc[42, 'Sign Description'] = "T-junction with priority over vehicles from the right"
# Saved as a csv for later use and to avoid unnecessary re-scraping
uktrafficsigns.to_csv('uktrafficsigns.csv', index=False, header=False)
# Render the "Sign" column as inline images instead of bare links
def path_to_image_html(path):
    """Return an HTML <img> tag, 60 wide, whose source is *path*."""
    return '<img src="' + path + '" width="60" >'


# None switches off column-width truncation so the <img> markup is not cut
# short. The former sentinel value -1 is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
# Build the HTML table once and display that same object
ukts = HTML(uktrafficsigns.to_html(escape=False, formatters=dict(Sign=path_to_image_html)))
ukts
# Subset of the sign table whose description mentions a junction.
# The substring "nction" is matched rather than "junction"; presumably so that
# differently-capitalised spellings are also picked up.
junction = uktrafficsigns[uktrafficsigns['Sign Description'].str.contains("nction", regex=False)]
# Rendered as its own HTML object. path_to_image_html is already defined by the
# cell that renders the full sign table, so it is reused rather than redefined.
# None disables column truncation (the old -1 sentinel is deprecated in pandas).
pd.set_option('display.max_colwidth', None)
HTML(junction.to_html(escape=False, formatters=dict(Sign=path_to_image_html)))
# Same steps for give-way signs: keep descriptions containing "ive ".
give = uktrafficsigns[uktrafficsigns['Sign Description'].str.contains("ive ", regex=False)]
# path_to_image_html is already defined by the cell that renders the full sign
# table, so it is reused rather than redefined.
# None disables column truncation (the old -1 sentinel is deprecated in pandas).
pd.set_option('display.max_colwidth', None)
HTML(give.to_html(escape=False, formatters=dict(Sign=path_to_image_html)))
# Roundabout signs: keep descriptions containing "ounda".
roundabout = uktrafficsigns[uktrafficsigns['Sign Description'].str.contains("ounda", regex=False)]
# path_to_image_html is already defined by the cell that renders the full sign
# table, so it is reused rather than redefined.
# None disables column truncation (the old -1 sentinel is deprecated in pandas).
pd.set_option('display.max_colwidth', None)
HTML(roundabout.to_html(escape=False, formatters=dict(Sign=path_to_image_html)))
Below, we used Tableau to map what could be deemed problem areas for the UK. These are accidents in areas with high deprivation (driver_imd_decile at "more deprived 40-50%") and no signage at T or staggered junctions.
%%HTML
<div class='tableauPlaceholder' id='viz1572177057382' style='position: relative'><noscript><a href='https://github.com/GenTaylor/Traffic-Accident-Analysis'><img alt=' ' src='https://public.tableau.com/static/images/Ac/AccidentForecasting/SeriousAccidentsinAreaswithHighDeprivationandNoSignage/1_rss.png' style='border: none' /></a></noscript><object class='tableauViz' style='display:none;'><param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' /> <param name='embed_code_version' value='3' /> <param name='site_root' value='' /><param name='name' value='AccidentForecasting/SeriousAccidentsinAreaswithHighDeprivationandNoSignage' /><param name='tabs' value='no' /><param name='toolbar' value='yes' /><param name='static_image' value='https://public.tableau.com/static/images/Ac/AccidentForecasting/SeriousAccidentsinAreaswithHighDeprivationandNoSignage/1.png' /> <param name='animate_transition' value='yes' /><param name='display_static_image' value='yes' /><param name='display_spinner' value='yes' /><param name='display_overlay' value='yes' /><param name='display_count' value='yes' /></object></div> <script type='text/javascript'> var divElement = document.getElementById('viz1572177057382'); var vizElement = divElement.getElementsByTagName('object')[0]; vizElement.style.width='100%';vizElement.style.height=(divElement.offsetWidth*0.75)+'px'; var scriptElement = document.createElement('script'); scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js'; vizElement.parentNode.insertBefore(scriptElement, vizElement); </script>
# Separate dataframe indexed by accident_index. set_index() returns a new
# frame here; the previous `df1 = df` followed by an in-place set_index only
# created an alias, so `df` itself was re-indexed and the cell could not be
# re-run (accident_index was no longer a column).
df1 = df.set_index('accident_index')
df1.head()
# accident_severity is removed from the modelling frame
df1 = df1.drop(['accident_severity'], axis=1)
df1.head()
print(df1.columns)
# Split df1's columns into two frames by dtype: the listed numeric dtypes go
# to intfldtypes, every other column goes to notif.
numeric_kinds = ['int', 'float', 'int64']
intfldtypes = df1.select_dtypes(include=numeric_kinds)
notif = df1.select_dtypes(exclude=numeric_kinds)
print('Objects', notif.columns)
print("\nNonObjects", intfldtypes.columns)
# Check that the two parts together account for every column
for frame in (df1, notif, intfldtypes):
    print(frame.shape)
Label Encoder was used instead of OneHotEncoder due to the memory errors One Hot Encoder caused in the data. The algorithms used will be classifiers, through boosting and trees, and not linear.
# Label-encode each non-numeric column independently
obj_le = notif.apply(lambda column: LabelEncoder().fit_transform(column))
# Put the encoded columns back next to the numeric ones
df_ml = pd.concat([obj_le, intfldtypes], axis=1, sort=False)
# Check shape
print(df_ml.shape)
# Target vector and feature matrix
y = df_ml['accident_seriousness']
X = df_ml.drop(columns=['accident_seriousness'])
df_ml['accident_seriousness'].value_counts()
df.dtypes
# Class balance of the target: number of accidents per accident_seriousness value.
plt.figure(figsize=(12, 6))
ax = sns.countplot(x="accident_seriousness", palette="magma", data=df)
plt.style.use('dark_background')
ax.set_title("Accident Seriousness", fontsize=25, fontweight="bold")
ax.set_xlabel("", fontsize=15, fontweight="bold")
ax.set_ylabel("\nNumber of Accidents\n", fontsize=15, fontweight="bold")
plt.xticks(fontsize=18)
plt.yticks(fontsize=12)
sns.despine(top=True, right=True, left=True, bottom=False)
plt.savefig('accident_seriousness.png')
plt.show()
The data in this dataset is extremely imbalanced for what we are trying to predict. We are going to resample the data by undersampling, i.e. reducing the number of majority-class (Not Serious Accidents) samples.
The machine learning classifier algorithms that we are going to use are as follows:
*Gradient Boosting was commented out because of the time it took to run (18hrs) and not having relevant enough results to still consider.
# Hold out 25% of the rows as the test set
res_X_train, res_X_test, res_y_train, res_y_test = train_test_split(
    X, y, test_size=0.25, random_state=27)

# Re-attach the target to the training features so whole rows can be resampled
res_X = pd.concat([res_X_train, res_y_train], axis=1)

# Split the training rows by encoded class label
severe = res_X[res_X['accident_seriousness'] == 1]
not_severe = res_X[res_X['accident_seriousness'] == 0]

# Undersample the label-0 rows: draw (with replacement) exactly as many of
# them as there are label-1 rows, with a fixed seed for reproducibility
not_severe_decreased = resample(not_severe,
                                replace=True,
                                n_samples=len(severe),
                                random_state=27)

# Balanced training frame: all label-1 rows plus the reduced label-0 sample
newdf = pd.concat([severe, not_severe_decreased])
newdf['accident_seriousness'].value_counts()

res_y_train = newdf['accident_seriousness']
res_X_train = newdf.drop(columns='accident_seriousness')
Before we get into predictions, we are going to run some unsupervised machine learning in order to see how the data relates to itself. We are going to do this on the resampled data as well, in order to avoid bias. We will use two clusters which, in theory, represent the two values of accident_seriousness: Not Serious and Serious.
# Clustering with k-modes (a k-means analogue for categorical data) using
# Huang initialisation. random_state is fixed, like every other estimator and
# split in this notebook, so the cluster assignment is reproducible.
km_huang = KModes(n_clusters=2, init="Huang", n_init=1, random_state=42)
fitClusters_huang = km_huang.fit_predict(newdf)
fitClusters_huang
# Move accident_index out of the index so the rows line up positionally with
# the predicted cluster labels
newdf1 = newdf.reset_index()
clustersDf = pd.DataFrame(fitClusters_huang, columns=['cluster_predicted'])
# Both frames already carry a 0..n-1 index, so a plain column-wise concat
# aligns them; no extra reset_index/drop round trip is needed
combinedDf = pd.concat([newdf1, clustersDf], axis=1)
combinedDf.head()
# Plot a selection of features against the predicted cluster to see how each
# category pairs off with the clustering.
# Each feature appears once: the previous version drew junction_detail,
# junction_location and driver_imd_decile twice (its last grid row was a copy
# of the row above it).
cluster_features = [
    'did_police_officer_attend_scene_of_accident', 'x1st_point_of_impact', 'number_of_vehicles',
    'speed_limit', 'urban_or_rural_area', 'skidding_and_overturning',
    'vehicle_leaving_carriageway', 'sex_of_driver', 'vehicle_type',
    'junction_control', 'number_of_casualties', 'age_band_of_driver',
    'junction_detail', 'junction_location', 'driver_imd_decile',
]
n_cols = 3
n_rows = math.ceil(len(cluster_features) / n_cols)

plt.style.use('dark_background')
# Four units of height per grid row, the same per-row height as before
f, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(12, 4 * n_rows))
# axs.flat walks the grid row by row, matching the original panel order
for feature, axis in zip(cluster_features, axs.flat):
    sns.countplot(x=combinedDf[feature],
                  order=combinedDf[feature].value_counts().index,  # most frequent category first
                  hue=combinedDf['cluster_predicted'], palette='PuBu', ax=axis)
plt.tight_layout()
plt.savefig('clusterplot.png')
plt.show()
Looking at these graphs, we can see how each category of each column pairs off with the clustering on accident_seriousness.
#confusion matrix plot function
def cm_plot(var):
    """Draw a 2x2 confusion matrix as an annotated image and show it.

    var is indexed as var[row][col]; rows are labelled "Actual" and columns
    "Predicted", both with the classes 'Not Serious' and 'Serious'. Each cell
    is annotated with its TN/FP/FN/TP tag and the value it holds.
    """
    plt.figure(figsize=(15, 5))
    plt.style.use('dark_background')
    plt.clf()
    plt.imshow(var, interpolation='nearest', cmap='tab20')

    labels = ['Not Serious', 'Serious']
    positions = np.arange(len(labels))
    plt.title('Confusion Matrix')
    plt.ylabel('Actual\n')
    plt.xlabel('Predicted\n')
    plt.xticks(positions, labels)
    plt.yticks(positions, labels)

    # Tags laid out the same way as the matrix: one tuple per row
    cell_tags = (('TN', 'FP'), ('FN', 'TP'))
    for row, row_tags in enumerate(cell_tags):
        for col, tag in enumerate(row_tags):
            plt.text(col, row, "{}={}".format(tag, var[row][col]),
                     horizontalalignment='center', color='black')
    plt.show()
First, we are going to run some standard classifier algorithms using the resampling method from above, gather the results of some scoring metrics (Accuracy, Log Loss, Cross Validation, Recall, Roc Auc, F1, False Positive Rate, Error Rate), and put those scores into a dataframe
# Candidate classifiers to be trained on the resampled training set.
# Each one uses 500 estimators and random_state=42.
n_features = X.shape[1]  # the bagging and forest models may use every feature
classifiers = [
    BaggingClassifier(n_estimators=500, max_features=n_features, random_state=42),
    AdaBoostClassifier(n_estimators=500, learning_rate=0.05, random_state=42),
    RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=40,
                           max_features=n_features, min_samples_split=8,
                           random_state=42),
    LGBMClassifier(n_estimators=500, learning_rate=0.03, max_depth=40,
                   min_data_in_leaf=10, num_leaves=50, random_state=42),
    XGBClassifier(n_estimators=500, learning_rate=0.05, subsample=1,
                  gamma=1, max_depth=40, random_state=42),
]
# Fit every classifier on the resampled training set, score it on the test
# set, and collect one row of metrics per classifier.
res_cols = ["Classifier", "Accuracy", "Log Loss", "Cross Val", "Recall", "Roc Auc", "F1",
            "False Positive Rate", "Error Rate"]
# Rows are gathered in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0.
result_rows = []
for clf in classifiers:
    clf.fit(res_X_train, res_y_train)
    name = clf.__class__.__name__
    print("\n"*3)
    print(name, "Results:")
    print('~'*40)
    y_pred = clf.predict(res_X_test)
    acc = accuracy_score(res_y_test, y_pred)
    print("Accuracy: {:.4%}".format(acc))
    # Mean 3-fold cross-validation score on the training data
    cv = np.mean(cross_val_score(clf, res_X_train, res_y_train, cv=3))
    print("Cross validation scores:", cv)
    # Despite the name, these are class probabilities for the TEST set
    train_predictions = clf.predict_proba(res_X_test)
    logloss = log_loss(res_y_test, train_predictions)
    print("Log Loss: {}".format(logloss))
    cm = confusion_matrix(res_y_test, y_pred)
    cm_plot(cm)
    # FPR and error rate, reusing the confusion matrix computed above
    tn, fp, fn, tp = cm.ravel()
    fpr = fp/(tn+fp)
    ers = 1-acc
    rec = recall_score(res_y_test, y_pred)
    # NOTE(review): ROC AUC is computed from hard class labels rather than
    # predicted probabilities; kept as is so the reported numbers stay comparable.
    roc = roc_auc_score(res_y_test, y_pred)
    f1s = f1_score(res_y_test, y_pred)
    # Every metric except log loss is stored as a percentage
    result_rows.append([name, round(acc*100, 3), round(logloss, 3),
                        round(cv*100, 3), round(rec*100, 3), round(roc*100, 3),
                        round(f1s*100, 3), round(fpr*100, 3), round(ers*100, 3)])
    print("*"*40)
results = pd.DataFrame(result_rows, columns=res_cols)
print("Results Shape", results.shape)
results.head(10)
For the following Balanced algorithms from imblearn we will be using the standard testing and training sets (X_train, X_test, y_train, y_test) and will allow the algorithms to do the resampling.
For the sampling_strategy, we will be using majority as the solution.
'majority': resample only the majority class
We will then gather the results of some scoring metrics (Accuracy, Log Loss, Cross Validation, Recall, Roc Auc, F1, False Positive Rate, Error Rate), and put those scores into a dataframe.
# train_test_split without resampling: the imblearn estimators below do their
# own balancing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)
# Settings shared by all three balanced estimators
balancing = dict(sampling_strategy='majority', replacement=True, random_state=42)
# Balanced classifiers to compare
classifiers2 = [
    BalancedBaggingClassifier(max_features=X.shape[1], n_estimators=500, **balancing),
    EasyEnsembleClassifier(n_estimators=500, **balancing),
    BalancedRandomForestClassifier(criterion='entropy', max_depth=40, min_samples_leaf=1,
                                   max_features=X.shape[1], min_samples_split=8,
                                   n_estimators=500, **balancing),
]
# Fit every balanced classifier on the un-resampled training set, score it on
# the test set, and collect one row of metrics per classifier.
res_cols2 = ["Classifier", "Accuracy", "Log Loss", "Cross Val", "Recall", "Roc Auc", "F1",
             "False Positive Rate", "Error Rate"]
# Rows are gathered in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0.
result_rows2 = []
for clf2 in classifiers2:
    clf2.fit(X_train, y_train)
    name2 = clf2.__class__.__name__
    print("\n"*3)
    print(name2, "Results:")
    print('~'*40)
    y_pred2 = clf2.predict(X_test)
    acc2 = accuracy_score(y_test, y_pred2)
    print("Accuracy: {:.4%}".format(acc2))
    # Mean 3-fold cross-validation score on the training data
    cv2 = np.mean(cross_val_score(clf2, X_train, y_train, cv=3))
    print("Cross validation scores:", cv2)
    # Despite the name, these are class probabilities for the TEST set
    train_predictions2 = clf2.predict_proba(X_test)
    logloss2 = log_loss(y_test, train_predictions2)
    print("Log Loss: {}".format(logloss2))
    cm2 = confusion_matrix(y_test, y_pred2)
    cm_plot(cm2)
    # FPR and error rate, reusing the confusion matrix computed above
    tn, fp, fn, tp = cm2.ravel()
    fpr2 = fp/(tn+fp)
    # Error rate of THIS classifier. The previous `1-acc` used the accuracy
    # left over from the first loop's last classifier.
    ers2 = 1-acc2
    rec2 = recall_score(y_test, y_pred2)
    # NOTE(review): ROC AUC is computed from hard class labels rather than
    # predicted probabilities; kept as is so the reported numbers stay comparable.
    roc2 = roc_auc_score(y_test, y_pred2)
    f1s2 = f1_score(y_test, y_pred2)
    # Every metric except log loss is stored as a percentage
    result_rows2.append([name2, round(acc2*100, 3), round(logloss2, 3),
                         round(cv2*100, 3), round(rec2*100, 3), round(roc2*100, 3),
                         round(f1s2*100, 3), round(fpr2*100, 3), round(ers2*100, 3)])
    print("*"*40)
results2 = pd.DataFrame(result_rows2, columns=res_cols2)
print("Results 2 Shape", results2.shape)
results2.head(10)
We will now combine the dataframes from both methods into one dataframe for analysis and visualization.
# Stack the two result tables. ignore_index=True renumbers the rows so the
# combined table (and the index column written to the csv) has unique labels
# instead of repeating the labels of the two inputs.
ml_results = pd.concat([results, results2], ignore_index=True)
print("Shape", ml_results.shape)
ml_results.head(10)
# Save to csv
ml_results.to_csv('ml_results.csv')
# One horizontal bar chart per metric, comparing all classifiers
score_columns = ['Accuracy', 'Log Loss', 'Cross Val', 'Recall', 'Roc Auc', 'F1',
                 'False Positive Rate', 'Error Rate']
fig, ax = plt.subplots(nrows=8, ncols=1, figsize=(11, 18))
plt.style.use('dark_background')
for axis, column in zip(ax, score_columns):
    sns.barplot(x=column, y='Classifier', data=ml_results, palette='plasma', ax=axis)
plt.tight_layout()
plt.show()
Based on the visualizations above, the Balanced Bagging Classifier from imblearn is the algorithm of choice for this data. While some of the scores were close, the Balanced Bagging Classifier had the highest scores in Accuracy, Cross Validation, and Specificity. It also had the lowest Error Rate and False Positive Rate of the group.
The Balanced Bagging Classifier performed the best of the classifiers; however, I was not comfortable with how close its predictions were for Serious Accidents in the confusion matrix. Because of this, I decided to combine the Balanced Bagging Classifier with the second-highest-performing algorithm, LightGBM, to see what results I would get.
# Start the wall-clock timer for this experiment
start_res_bbag_w_lgbm = time.time()

# Balanced Bagging Classifier wrapped around a LightGBM base learner
# NOTE(review): newer imbalanced-learn releases rename `base_estimator` to
# `estimator`; confirm against the installed version.
lgbm_base = LGBMClassifier(learning_rate=0.03, max_depth=40, min_data_in_leaf=10,
                           n_estimators=500, num_leaves=50, random_state=42)
res_bbag_w_lgbm = BalancedBaggingClassifier(base_estimator=lgbm_base,
                                            max_features=X.shape[1], n_estimators=500,
                                            replacement=True, sampling_strategy='majority',
                                            random_state=42)
res_bbag_w_lgbm.fit(X_train, y_train)
pred_res_bbag_w_lgbm = res_bbag_w_lgbm.predict(X_test)

# Confusion matrix, wrapped in a labelled dataframe for easier plotting
class_labels = ['Not Serious', 'Serious']
res_bbag_w_lgbm_cm = confusion_matrix(y_test, pred_res_bbag_w_lgbm)
res_bbag_w_lgbm_cm_df = pd.DataFrame(res_bbag_w_lgbm_cm, index=class_labels,
                                     columns=class_labels)

plt.figure(figsize=(15, 5))
plt.style.use('dark_background')
sns.heatmap(res_bbag_w_lgbm_cm_df, annot=True, fmt="d", cmap='viridis',
            linecolor='black', linewidths=1)
heatmap_title = 'Resampled Balanced Bagging with LightGBM Accuracy: {0:.2f}%'.format(
    accuracy_score(y_test, pred_res_bbag_w_lgbm)*100)
plt.title(heatmap_title, fontsize=15)
plt.ylabel('Actual\n')
plt.xlabel('Predicted\n')
plt.show()

# The cross-validation run is left disabled to save time on re-runs; the
# score from the earlier run is reported by the prints below instead.
# print("Resampled Balanced Bagging with LightGBM Classifier Cross Validation Score: {:0.2f}%"
# .format(np.mean(cross_val_score(res_bbag_w_lgbm, X_train, y_train, cv=3)*100)))
print('Cross Val Score was 69.67%. It was commented out here to save time when re-running.')
print('Check UK_Road_Safety_Traffic_Accidents_and_Vehicles(old).ipynb to see proof')
print('\n')

# Stop the timer and report the elapsed seconds
end_res_bbag_w_lgbm = time.time()
print("\nResampled Balanced Bagging with LightGBM Time: ", end_res_bbag_w_lgbm - start_res_bbag_w_lgbm)
# Extract true negatives, false positives, false negatives and true positives
tn, fp, fn, tp = confusion_matrix(y_test, pred_res_bbag_w_lgbm).ravel()
accuracy = accuracy_score(y_test, pred_res_bbag_w_lgbm)*100
specificity = tn/(tn+fp)*100
fpr = fp/(tn+fp)*100
ers = 100-accuracy
# Class probabilities for the test set (needed for log loss)
train_predictions2 = res_bbag_w_lgbm.predict_proba(X_test)

model_label = "Resampled Balanced Bagging Classifier with LightGBM"
print("{} Specificity Score: {:.2f}%".format(model_label, specificity))
print("{} False Positive Rate Score: {:.2f}%".format(model_label, fpr))
print("{} Error Rate Score: {:.2f}%".format(model_label, ers))
# Check scores
print("{} Accuracy Score: {:.2f}%".format(model_label, accuracy))
print("{} F1 Score: {:.2f}%"
      .format(model_label, f1_score(y_test, pred_res_bbag_w_lgbm, average="macro")*100))
# Label fixed: it previously read "Precision Scoreres_:"
print("{} Precision Score: {:.2f}%"
      .format(model_label, precision_score(y_test, pred_res_bbag_w_lgbm, average="macro")*100))
print("{} Recall Score: {:.2f}%"
      .format(model_label, recall_score(y_test, pred_res_bbag_w_lgbm, average="macro")*100))
# NOTE(review): ROC AUC is computed from hard class labels, not probabilities
print("{} Roc Auc Score: {:.2f}%"
      .format(model_label, roc_auc_score(y_test, pred_res_bbag_w_lgbm)*100))
# Log loss is not a percentage, so it is printed as the raw value instead of
# being multiplied by 100 and suffixed with "%"
print("{} Log Loss: {:.3f}"
      .format(model_label, log_loss(y_test, train_predictions2)))
The results were better than the other learning algorithms but lower accuracy wise than the previous Balanced Bagging Algorithm. Taking all of that into consideration, I have decided that depending on what was the goal, either Balanced Bagging Classifier algorithm could be used. If I were more concerned with overall accuracy, the regular Balanced Bagging Classifier would be used. If I were more concerned with making sure "Serious" predictions were achieved, Balanced Bagging Classifier with LightGBM would be used.
# Add the Balanced Bagging + LightGBM scores as one more row of the results
# table. The values are hard-coded (presumably copied from an earlier run of
# that model; verify before reuse). pd.concat is used because
# DataFrame.append was removed in pandas 2.0.
bbag_lgbm_row = pd.DataFrame(
    [["BalancedBaggingClassifierW/LGBM", 69.140, 0.582,
      69.670, 68.240, 68.240, 57.140, 30.570, 30.860]],
    columns=ml_results.columns)
ml_results = pd.concat([ml_results, bbag_lgbm_row], ignore_index=True)
ml_results.head(10)
# One horizontal bar chart per metric, now including the combined model
score_columns = ['Accuracy', 'Log Loss', 'Cross Val', 'Recall', 'Roc Auc', 'F1',
                 'False Positive Rate', 'Error Rate']
fig, ax = plt.subplots(nrows=8, ncols=1, figsize=(11, 18))
plt.style.use('dark_background')
for axis, column in zip(ax, score_columns):
    sns.barplot(x=column, y='Classifier', data=ml_results, palette='plasma', ax=axis)
plt.tight_layout()
plt.show()
# Persist the final comparison table
ml_results.to_csv('ml_results_final_results.csv')
Genesis L. Taylor
Github | Linkedin | Tableau | genesisltaylor@gmail.com